# steps 4,5, 6 use euclidean distance
library(plotly)
library(seriation)
library(scales)
library(dplyr)
library(tidyr)

Question 1.

# keep columns 1,2,5,6,7,9,10,16,17,18,19

p_e <- prices_earnings[, c(1,2,5,6,7,9,10,16,17,18,19)]

rownames(p_e) <- p_e[[1]]

Question 2.

Without doing any reordering We cannot identify any clusters or outliers.

#p_e_sc %>% 
  plot_ly(x =~colnames(p_e_sc), y =~rownames(p_e_sc),
    z = ~p_e_sc, type = "heatmap", 
    colors = colorRamp(c("black","red"))
  ) %>%
  layout(title =  "Heatmap of prices and earnings",
         xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
         yaxis = list(title = "Cities", zeroline = FALSE)
  )

Question 3.

# seriation needs to permute rows and columns, thus distance by row and column
p_e_rdist <- dist(p_e_sc, method = "euclidean")

p_e_cdist <- dist(t(p_e_sc), method = "euclidean")

We used method “OLO” as the Hierarchical Clustering algorithm instead of “HC” method, because according to the documentation in seriation package and @[hahsler] the former does not optimize the Hamiltonian Path Length.

# make sure that results are reproducible
set.seed(1011)
# get orders of the row and col distances; Hamilton path length
order1 <- get_order(seriate(p_e_rdist, method = "OLO"))

order2 <-get_order(seriate(p_e_cdist, method = "OLO"))

p_reord <- p_e_sc[rev(order1), order2]
plot_ly(x =~colnames(p_reord), y =~rownames(p_reord),
    z = ~p_reord, type = "heatmap", 
    colors = colorRamp(c("black","red"))
  ) %>%
  layout(title =  "Heatmap of prices and earnings (Euclid dist - HC)",
         xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
         yaxis = list(title = "Cities", zeroline = FALSE)
  )
# computing distance as one minus correlation

p_e_cor <- as.dist((1 - cor(p_e_sc))/2)

p_e_cor1 <- as.dist((1 - cor(t(p_e_sc)))/2)
# set seed to ensure results are reproducible
set.seed(1212)

# get orders for columns and rows
ord1 <- get_order(seriate(p_e_cor, method = "OLO"))

ord2 <- get_order(seriate(p_e_cor1, method = "OLO"))

# reorder
p_reord2 <- p_e_sc[rev(ord2), ord1]
plot_ly(x =~colnames(p_reord2), y =~rownames(p_reord2),
    z = ~p_reord2, type = "heatmap", 
    colors = colorRamp(c("black","red"))
  ) %>%
  layout(title =  "Heatmap of prices and earnings (Cor dist)",
         xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
         yaxis = list(title = "Cities", zeroline = FALSE)
  )

The ordering by euclidean distance produces a heat map that is easier to analyze. At first glance we can perceive four general regions of two groups. The first group heat map color tends towards a brighter shade of red while the second group tend towards a darker shade of red/black. Although these groups can be seen in the correlation distance heat map, it is not as clear as the first.

Based on the euclidean distance heat map, net wage tends to higher values from Dubai while the number of hours worked decrease. This is the opposite to cities like Delhi,Bankok and Seoul. Interestingly food costs are generally low in the cities with highee working hours. Caracas is an outlier because food costs are high while net wage and the number of hours worked remains low.

Question 4.

# use p_e_rdist and p_e_cdist (euclidean distance)
# set seed
set.seed(11)

ord_q4_1 <- get_order(seriate(p_e_rdist, method = "TSP"))

set.seed(111)
order_q4_2 <-get_order(seriate(p_e_cdist, method = "TSP"))

p_reord_q4 <- p_e_sc[rev(ord_q4_1), order_q4_2]
plot_ly(x =~colnames(p_reord_q4), y =~rownames(p_reord_q4),
    z = ~p_reord_q4, type = "heatmap", 
    colors = colorRamp(c("black","red"))
  ) %>%
  layout(title =  "Heatmap of prices and earnings (Euclid dist- TSP)",
         xaxis = list(title = "Price-Earnings Indicators", zeroline = FALSE),
         yaxis = list(title = "Cities", zeroline = FALSE)
  )

From visual comparison of the heatmap produced by HC solver and TSP, we prefer the latter because there appears to be a separation along the anti diagonal such that cities that have similary high prices are placed along this diagonal. In the heatmap by the TSP solver this distinctionn is not quite clear.

# function creterion to compare unordered distance and ordered
# distance = p_e_rdist (row distance)
# or = order

# set seed 
set.seed(222)

or1 <- seriate(p_e_rdist, method = "OLO")

set.seed(333)

or2 <- seriate(p_e_rdist, method = "TSP")

result1 <- rbind(unordered = criterion(p_e_rdist), ordered = criterion(p_e_rdist,or1 ))

result2 <- rbind(unordered = criterion(p_e_rdist), ordered = criterion(p_e_rdist,or2 ))
# criterion on HC solver and unordered
result1
##                2SUM AR_deviations AR_events      BAR      Cor_R
## unordered 1012004.5     107139.67     61656 29259.38 0.04268063
## ordered    756120.2      20296.15     26876 19055.56 0.20713342
##           Gradient_raw Gradient_weighted  Inertia Lazy_path_length
## unordered        -4032         -11081.08 17886336        10126.431
## ordered          65528         156151.06 24666913         3713.804
##           Least_squares        LS       ME Moore_stress Neumann_stress
## unordered       3575435 1006126.8 568.2673     986.9925       553.9889
## ordered         3352459  894638.7 652.4429     411.5392       239.0233
##           Path_length      RGAR
## unordered    281.7269 0.5169014
## ordered      121.9671 0.2253186
# criterion on TSP and unordered.
result2
##                2SUM AR_deviations AR_events      BAR      Cor_R
## unordered 1012004.5     107139.67     61656 29259.38 0.04268063
## ordered    838955.9      47893.87     40089 20181.60 0.11204948
##           Gradient_raw Gradient_weighted  Inertia Lazy_path_length
## unordered        -4032         -11081.08 17886336        10126.431
## ordered          39102          89163.24 21209322         4540.624
##           Least_squares        LS       ME Moore_stress Neumann_stress
## unordered       3575435 1006126.8 568.2673     986.9925       553.9889
## ordered         3441776  939297.2 653.2224     413.0466       237.8996
##           Path_length      RGAR
## unordered    281.7269 0.5169014
## ordered      119.0998 0.3360915

TSP solver has shorter path length (121.718) compared to HC solver (121.967) by a very small margin of .249. Thus it does a slighlty better job of optimizing the Hamiltonian Path Length. For measure of effectiveness (ME) the HC solver (ME = 652.44) has advantage over the TSP (ME = 651.598). A higher ME implies a better arrangement of the dissimilarity matrix. For gradient measures since objective is to increase the distance from main diagonal, we infer that HC (65528) does a better job of achieving this compared to TSP solver(41312).

Question 5.

# parallel coordinates plot from unsorted scaled data 

p_e_sc2 <- as.data.frame(p_e_sc)

p_e_sc2 <- round(p_e_sc2, 1)
p_e_sc2 %>% plot_ly(type ="parcoords",
  dimensions = list(
    list(label = "Food.Costs...", values = ~Food.Costs...),
    list(label = "iPhone.4S.hr.", values = ~iPhone.4S.hr.),
    list(label = "Clothing.Index", values = ~Clothing.Index),
    list(label = "Hours.Worked", values = ~Hours.Worked),
    list(label = "Wage.Net", values = ~Wage.Net),
    list(label = "Vacation.Days", values = ~Vacation.Days),
    list(label = "Big.Mac.min.", values = ~Big.Mac.min.),
    list(label = "Bread.kg.in.min.", values = ~Bread.kg.in.min.),
    list(label = "Rice.kg.in.min.", values = ~Rice.kg.in.min.),
    list(label = "Goods.and.Services...", values = ~Goods.and.Services...)
  )
)
# adding a factored column by iphone column which defines the clusters
p_e_sc2$clust <-ifelse(p_e_sc2$iPhone.4S.hr. < -0.5, 0, 1)
p_e_sc2 %>% plot_ly(type ="parcoords",
  line = list(color = ~clust, colorscale = list(c(0, "red"), c(1, "blue"))),
  dimensions = list(
    list(label = "Food.Costs...", values = ~Food.Costs...),
    list(label = "iPhone.4S.hr.", values = ~iPhone.4S.hr.),
    list(label = "Clothing.Index", values = ~Clothing.Index),
    list(label = "Hours.Worked", values = ~Hours.Worked),
    list(label = "Wage.Net", values = ~Wage.Net),
    list(label = "Vacation.Days", values = ~Vacation.Days),
    list(label = "Big.Mac.min.", values = ~Big.Mac.min.),
    list(label = "Bread.kg.in.min.", values = ~Bread.kg.in.min.),
    list(label = "Rice.kg.in.min.", values = ~Rice.kg.in.min.),
    list(label = "Goods.and.Services...", values = ~Goods.and.Services...)
  )
)

We can identify two clusters defined by Wage net (blue) and iphone 4s (red). Wage net has values greater than 0 in the red cluster (defined by iphone 4) while iphone has values has values greater than -0.5 in the blue cluster. These clusters are difficult to interpret. The most prominent outlier in the blue cluster is the price of Rice per kg, The lines at this variable are furthest apart . It does not seem to fit into any of the two clusters.

# get the rownames to a column
p_ord_HC <- as.data.frame(p_reord)
p_ord_HC$name <- rownames(p_ord_HC)
# colapse the p_ord_HC to long format
p_ord_HC %>%
  tidyr::gather(variable, value, -name, factor_key = T) %>%
  arrange(name) %>%
  ggplot(aes(x=variable, y=value, group=name)) + 
  geom_polygon(fill="#F81894") + 
  coord_polar() + theme_classic() + 
  facet_wrap(~ name) #+ 

  #theme(axis.text.x = element_text(size = 5))

Cairo and Delhi are outliers with respect to the price of bread per kg. Prague, Johanessburg and Panama form a cluster the prices are comparatively small and similar across the products.. The most distinct outlier would be Caracas. The prices aååear higher due to the size of the radar chart.

Question 7.

Among Heatmaps, paralled coordinates and radar charts, heatmaps are relatively easier to interprate in terms of time and accuracy. Radar cahrts are hard to interprate when the number of variables to compare a re high. Parallel coordiantes are generally messy to work with.